Exploratory Data Analysis is about: - univariate non-graphical EDA - univariate graphical EDA - multivariate non-graphical EDA - multivariate graphical EDA
Methods to explore:
- Variation (describe behavior within variable)
- Covariation (describe behavior between variables)
- Missing data / anomalies
- Patterns / correlations
Tools: - tables
- scatterplot
- histogram
- boxplot
- heatplot and hex plots
Pull in new dataset.
Dataset from Department of Labor about applications for permanent employment certification. Download the “PERM” FY 2021 Disclosure file (xlsx) and put in folder /lessons/data.
library(tidyverse)
library(readxl)
library(gt)
library(gtExtras) #remotes::install_github("jthomasmock/gtExtras")
# in some cases, we need to change guess_max because the column may have nulls for a lot of the first rows
df = read_excel("lessons/data/PERM_Disclosure_Data_FY2021_Q3.xlsx",guess_max=15000)
head(df,2) %>% as.data.frame()
Do general data checks to understand the data you are dealing with.
# validate if all cases are distinct
df %>% distinct(CASE_NUMBER) %>% nrow() == nrow(df)
## [1] TRUE
# look at received yr
df %>% mutate(received_yr=lubridate::year(RECEIVED_DATE)) %>% group_by(received_yr) %>% count() %>% ungroup()
# look at decision yr
df %>% mutate(decision_yr=lubridate::year(DECISION_DATE)) %>% group_by(decision_yr) %>% count() %>% ungroup()
# look at employers
df %>% group_by(EMPLOYER_NAME) %>% count() %>% ungroup() %>% arrange(desc(n))
Now, we can make some interesting graphics / tables to describe trends.
summary(df$PW_WAGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 7 67413 94307 91555 116334 6169300 10
wages = df %>%
group_by(PW_WAGE) %>% count() %>% ungroup()
wages %>%
ggplot(aes(x=PW_WAGE,y=n)) + geom_bar(stat='identity')
## Warning: Removed 1 rows containing missing values (position_stack).
wages %>%
ggplot(aes(x=PW_WAGE,y=n)) + geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
df %>%
group_by(PW_WAGE) %>% count() %>% ungroup() %>%
ggplot(aes(x=PW_WAGE,y=n)) + geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
df %>%
group_by(PW_WAGE) %>% count() %>% ungroup() %>%
ggplot(aes(x=PW_WAGE,y=n)) + geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
top_10_employers = df %>% group_by(EMPLOYER_NAME) %>% count() %>% ungroup() %>% arrange(desc(n)) %>% head(10)
df_top_10 = df %>% dplyr::filter(EMPLOYER_NAME %in% top_10_employers$EMPLOYER_NAME)
gg_all = df %>%
ggplot(aes(x=PW_WAGE)) +
geom_histogram(breaks=seq(0,300000,by=10000),fill="black",colour="white") +
scale_x_continuous(name="Wage",breaks=seq(0,300000,by=50000),labels=scales::comma) +
scale_y_continuous(name="Applicants w/ Wage",labels=scales::comma) +
theme_bw() +
ggtitle("Wages per Applicant (2020-2021)")
gg_all
## Warning: Removed 10 rows containing non-finite values (stat_bin).
gg_top10 = df %>%
dplyr::filter(EMPLOYER_NAME %in% top_10_employers$EMPLOYER_NAME) %>%
ggplot(aes(x=PW_WAGE)) +
geom_histogram(breaks=seq(0,300000,by=10000),fill="black",colour="white") +
scale_x_continuous(name="Wage",breaks=seq(0,300000,by=50000),labels=scales::comma) +
scale_y_continuous(name="Applicants w/ Wage",labels=scales::comma) +
theme_bw() +
ggtitle("Wages per Applicant (2020-2021)")
ggplot() +
geom_histogram(data = df,mapping=aes(x=PW_WAGE),breaks=seq(0,300000,by=10000),fill="grey",colour="white") +
geom_histogram(data = df_top_10,mapping=aes(x=PW_WAGE),breaks=seq(0,300000,by=10000),fill="black",colour="white") +
scale_x_continuous(name="Wage",breaks=seq(0,300000,by=50000),labels=scales::comma) +
scale_y_continuous(name="Applicants w/ Wage",labels=scales::comma) +
theme_bw() +
ggtitle(label = "Wages per Applicant (2020-2021)", subtitle = "Black represents top 10 employers")
## Warning: Removed 10 rows containing non-finite values (stat_bin).
df_top_10 = df %>% dplyr::filter(EMPLOYER_NAME %in% top_10_employers$EMPLOYER_NAME)
top_paid = df %>% arrange(desc(PW_WAGE)) %>% top_frac(0.1,wt = PW_WAGE) %>%
group_by(EMPLOYER_NAME) %>% summarise(n=n(),sum=sum(PW_WAGE,na.rm=TRUE),max=max(PW_WAGE,na.rm=TRUE),med=stats::median(PW_WAGE,na.rm=TRUE)) %>% ungroup() %>%
arrange(desc(n)) %>% head(10)
df %>% dplyr::filter(EMPLOYER_NAME %in% top_paid$EMPLOYER_NAME)